In [1]:
# Computations
import numpy as np
import pandas as pd

# scipy
import scipy.stats as stats

# sklearn
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score, KFold
from sklearn import metrics
from sklearn.feature_selection import RFE
from scipy.stats import loguniform  # sklearn.utils.fixes.loguniform was removed in recent scikit-learn releases

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
import xgboost as xgb

# Visualisation libraries
import seaborn as sns

import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse, Polygon
import matplotlib.gridspec as gridspec

import missingno as msno

import plotly.offline as py
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot 
from plotly.subplots import make_subplots
from wordcloud import WordCloud
import re
# Graphics in retina format 
%config InlineBackend.figure_format = 'retina' 

# sns setting
sns.set_context("paper", rc={"font.size":12,"axes.titlesize":14,"axes.labelsize":12})
sns.set_style("whitegrid")

# plt setting
plt.style.use('seaborn-v0_8-whitegrid')  # named 'seaborn-whitegrid' in matplotlib < 3.6
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['text.color'] = 'k'
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

Pima Indians Diabetes Data Classification using Feature Importance

In this article, we use Kaggle's Pima Indians Diabetes dataset. The Pima Indians are a group of Native Americans living in an area consisting of what is now central and southern Arizona. A variety of statistical methods are used here for prediction.

Context

This dataset is originally from the National Institute of Diabetes and Digestive and Kidney Diseases. The objective of the dataset is to diagnostically predict whether or not a patient has diabetes, based on certain diagnostic measurements included in the dataset. Several constraints were placed on the selection of these instances from a larger database. In particular, all patients here are females at least 21 years old of Pima Indian heritage.

Content

The datasets consist of several medical predictor variables and one target variable, Outcome. Predictor variables include the number of pregnancies the patient has had, their BMI, insulin level, age, and so on.


Dataset Analysis

In [2]:
Data = pd.read_csv('pima-indians-diabetes-database/diabetes.csv')
display(Data.head())

print('The Dataset Shape: %i rows and %i columns' % Data.shape)
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
0 6 148 72 35 0 33.6 0.627 50 1
1 1 85 66 29 0 26.6 0.351 31 0
2 8 183 64 0 0 23.3 0.672 32 1
3 1 89 66 23 94 28.1 0.167 21 0
4 0 137 40 35 168 43.1 2.288 33 1
The Dataset Shape: 768 rows and 9 columns
Feature Explanations
Pregnancies: Number of times pregnant
Glucose: Plasma glucose concentration at 2 hours in an oral glucose tolerance test
BloodPressure: Diastolic blood pressure (mm Hg)
SkinThickness: Triceps skinfold thickness (mm)
Insulin: 2-Hour serum insulin (mu U/ml)
BMI: Body mass index (weight in kg/(height in m)^2)
DiabetesPedigreeFunction: Diabetes pedigree function
Age: Age (years)
Outcome: Whether or not a patient has diabetes (1 = diabetic, 0 = non-diabetic)
In [3]:
def Data_info(Inp, Only_NaN = False):
    # Summarise each column: dtype, NaN count, and NaN percentage
    Out = pd.DataFrame(Inp.dtypes, columns=['Data Type']).sort_values(by=['Data Type'])
    Out = Out.join(pd.DataFrame(Inp.isnull().sum(), columns=['Number of NaN Values']), how='outer')
    Out['Percentage'] = np.round(100*(Out['Number of NaN Values']/Inp.shape[0]), 2)
    if Only_NaN:
        Out = Out.loc[Out['Number of NaN Values'] > 0]
    return Out
display(Data_info(Data).T[:2])
_ = msno.bar(Data, figsize=(12,3), fontsize=14, log=False, color="#34495e")
display(Data.describe())
Age BMI BloodPressure DiabetesPedigreeFunction Glucose Insulin Outcome Pregnancies SkinThickness
Data Type int64 float64 int64 float64 int64 int64 int64 int64 int64
Number of NaN Values 0 0 0 0 0 0 0 0 0
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
count 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000
mean 3.845052 120.894531 69.105469 20.536458 79.799479 31.992578 0.471876 33.240885 0.348958
std 3.369578 31.972618 19.355807 15.952218 115.244002 7.884160 0.331329 11.760232 0.476951
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.078000 21.000000 0.000000
25% 1.000000 99.000000 62.000000 0.000000 0.000000 27.300000 0.243750 24.000000 0.000000
50% 3.000000 117.000000 72.000000 23.000000 30.500000 32.000000 0.372500 29.000000 0.000000
75% 6.000000 140.250000 80.000000 32.000000 127.250000 36.600000 0.626250 41.000000 1.000000
max 17.000000 199.000000 122.000000 99.000000 846.000000 67.100000 2.420000 81.000000 1.000000
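
Note the minimum of 0 for Glucose, BloodPressure, SkinThickness, Insulin, and BMI in the table above; zero is not a physiologically plausible value for these measurements, so such entries are effectively missing data. A quick check (a sketch, not part of the original notebook) counts them:

for col in ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']:
    # zero here almost certainly encodes a missing measurement
    print('%-13s %3i zero entries' % (col, (Data[col] == 0).sum()))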

Let's take a close look at our data.

In [4]:
fig, ax = plt.subplots(nrows=4, ncols=2, figsize = (16, 20))
for i in range(len(Data.columns[:-1])):
    # sns.distplot is deprecated in seaborn >= 0.11; newer releases use histplot/kdeplot
    sns.distplot(Data.iloc[:,i], rug=True, rug_kws={"color": "red"},
                 kde_kws={"color": "k", "lw": 2, "label": "KDE"},
                 hist_kws={"histtype": "step", "linewidth": 2,
                           "alpha": 1, "color": "Navy"}, ax= ax[int(i/2),i%2])
    if Data.iloc[:,i].name != 'BMI':
        ax[int(i/2),i%2].set_xlabel(re.sub(r"(\w)([A-Z])", r"\1 \2", Data.iloc[:,i].name))
In [5]:
Temp = ['Non-Diabetic' if x==0 else 'Diabetic' for x in Data['Outcome']]

fig = go.Figure(data=go.Splom(dimensions=[dict(label='Pregnancies', values=Data['Pregnancies']),
                              dict(label='Glucose', values=Data['Glucose']),
                              dict(label='Blood<br>Pressure', values=Data['BloodPressure']),
                              dict(label='Skin<br>Thickness', values=Data['SkinThickness']),
                              dict(label='Insulin', values=Data['Insulin']),
                              dict(label='BMI', values=Data['BMI']),
                              dict(label='Diabetes<br>Pedigree<br>Fun', values=Data['DiabetesPedigreeFunction']),
                              dict(label='Age', values=Data['Age'])],
                              showupperhalf=False,
                              marker=dict(color=Data['Outcome'], size=4, colorscale='Bluered',
                              line=dict(width=0.4, color='black')),
                              text=Temp, diagonal=dict(visible=False)))
del Temp
fig.update_layout(title='Scatterplot Matrix', dragmode='select',
                  width=900, height=900, hovermode='closest')
fig.show()

As can be seen, most features are roughly unimodal, but several contain outliers (as well as the implausible zero entries noted earlier) that need to be adjusted. In doing so, we define a normalizer as follows: for a given vector $x$ with mean $\mu$ and standard deviation $\sigma$,

\begin{align*} \text{Normalizer}(x, \text{cut})_i = \begin{cases} x_i &\text{if } |x_i - \mu| < \sigma \cdot \text{cut} \\ \operatorname{mode}(x) &\text{otherwise.} \end{cases} \end{align*}
In [6]:
def Normalizer(Col, cut = 3):
    # Keep only values within `cut` standard deviations of the mean; reassigning
    # the filtered Series below leaves NaN at the dropped indices, which are then
    # filled with the column mode.
    return Col[(Col > (Col.mean() - Col.std() * cut)) &
               (Col < (Col.mean() + Col.std() * cut))]

fig, ax = plt.subplots(nrows=4, ncols=2, figsize = (16, 20))
Temp = Data.copy()
for i in range(len(Data.columns[:-1])):
    Data[Data.columns[i]] = Normalizer(Data[Data.columns[i]])
    Data[Data.columns[i]] = Data[Data.columns[i]].fillna(Data[Data.columns[i]].dropna().mode()[0])
    # Sub-Plots
    sns.distplot(Data.iloc[:,i], rug=True, rug_kws={"color": "red"},
                 kde_kws={"color": "k", "lw": 2, "label": "KDE"},
                 hist_kws={"histtype": "step", "linewidth": 2,
                           "alpha": 1, "color": "Navy"}, ax= ax[int(i/2),i%2])
    if Data.iloc[:,i].name != 'BMI':
        ax[int(i/2),i%2].set_xlabel(re.sub(r"(\w)([A-Z])", r"\1 \2", Data.iloc[:,i].name))

In effect, we replaced the outlying data points with the feature mode, diminishing their influence; the following figure plots the absolute difference between the original and adjusted values.
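
To quantify the adjustment directly (a quick sketch, not in the original notebook; at this point Temp still holds the pre-normalization copy):

for col in ['Insulin', 'SkinThickness']:
    # extreme values are pulled back to the mode, so the maximum shrinks
    print('%-13s max before: %6.1f  max after: %6.1f'
          % (col, Temp[col].max(), Data[col].max()))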

In [7]:
Temp0 = Temp.copy()
Temp0.iloc[:,:-1] = abs(Data.iloc[:,:-1] - Temp.iloc[:,:-1])

Temp = ['Non-Diabetic' if x==0 else 'Diabetic' for x in Temp0['Outcome']]

fig = go.Figure(data=go.Splom(dimensions=[dict(label='Pregnancies', values=Temp0['Pregnancies']),
                              dict(label='Glucose', values=Temp0['Glucose']),
                              dict(label='Blood<br>Pressure', values=Temp0['BloodPressure']),
                              dict(label='Skin<br>Thickness', values=Temp0['SkinThickness']),
                              dict(label='Insulin', values=Temp0['Insulin']),
                              dict(label='BMI', values=Temp0['BMI']),
                              dict(label='Diabetes<br>Pedigree<br>Fun', values=Temp0['DiabetesPedigreeFunction']),
                              dict(label='Age', values=Temp0['Age'])],
                              showupperhalf=False,
                              marker=dict(color=Temp0['Outcome'], size=4, colorscale='Bluered',
                              line=dict(width=0.4, color='black')),
                              text=Temp, diagonal=dict(visible=False)))
del Temp, Temp0
fig.update_layout(title='Scatterplot Matrix', dragmode='select',
                  width=900, height=900, hovermode='closest')
fig.show()

Data Correlation

In [8]:
def Correlation_Plot(Df, Fig_Size):
    Correlation_Matrix = Df.corr()
    # Mask the upper triangle but keep the diagonal visible
    mask = np.zeros_like(Correlation_Matrix)
    mask[np.triu_indices_from(mask)] = True
    np.fill_diagonal(mask, 0)
    Fig, ax = plt.subplots(figsize=(Fig_Size, Fig_Size))
    sns.heatmap(Correlation_Matrix, ax=ax, mask=mask, annot=True, square=True, 
                cmap=sns.color_palette("RdYlGn", n_colors=10), linewidths=0.2, vmin=0, vmax=1, cbar_kws={"shrink": .7})
    bottom, top = ax.get_ylim()
    ax.set_ylim(bottom + 0.5, top - 0.5)  # workaround for the matplotlib 3.1.1 heatmap truncation bug

Correlation_Plot(Data, 9)
In [9]:
Temp = Data.iloc[:,:-1].var().sort_values(ascending = False).to_frame(name= 'Variance')
display(Temp)
Temp0 = Data.corr()
Temp0.loc[Temp.index[-1]].sort_values().to_frame(name= 'Correlation')[:-1].T
Variance
Insulin 7844.510917
Glucose 929.680350
SkinThickness 246.979708
BloodPressure 146.573540
Age 128.991301
BMI 43.941176
Pregnancies 10.734190
DiabetesPedigreeFunction 0.078702
Out[9]:
Pregnancies BloodPressure Age Glucose BMI SkinThickness Insulin Outcome
Correlation 0.015703 0.034428 0.066525 0.095686 0.122868 0.15229 0.184028 0.192156

Even though the variance of DiabetesPedigreeFunction is low, dropping it might not improve the performance of the model: its correlation with the rest of the features, and especially with the Outcome, is noticeable.
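
To see directly which features track the target most strongly (a quick sketch, not in the original notebook):

# Sketch: absolute correlation of each feature with the Outcome, strongest first
print(Data.corr()['Outcome'].drop('Outcome').abs().sort_values(ascending=False))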

Modeling and Classification

In [10]:
Target = 'Outcome'

X = Data.drop(columns = [Target])
y = Data[Target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

pd.DataFrame(data={'Set':['X_train','X_test','y_train','y_test'],
               'Shape':[X_train.shape, X_test.shape, y_train.shape, y_test.shape]}).set_index('Set').T
Out[10]:
Set X_train X_test y_train y_test
Shape (537, 8) (231, 8) (537,) (231,)

Furthermore, we would like to standardize features by removing the mean and scaling to unit variance.
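
That is, each feature is transformed as $z = (x - \mu)/\sigma$, with $\mu$ and $\sigma$ estimated on the training set only (hence fit_transform on the training data and transform on the test data below), so no information leaks from the test set.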

In [11]:
scaler = StandardScaler()

X_train_STD = scaler.fit_transform(X_train)
X_test_STD = scaler.transform(X_test)

X_train_STD = pd.DataFrame(data = X_train_STD, columns = X_train.columns)
X_test_STD = pd.DataFrame(data = X_test_STD, columns = X_test.columns)

Some other functions that we will be using later.

In [12]:
def Performance(clf, X_test = X_test_STD):
    y_pred = clf.predict(X_test)
    df = pd.DataFrame([{'Score': clf.score(X_test, y_test),
                        'F1 Score': f1_score(y_test.values, y_pred, average= 'weighted'),
                        'Precision Score': precision_score(y_test.values, y_pred, average= 'weighted'),
                        'Recall Score':  recall_score(y_test.values, y_pred, average= 'weighted')}])
    display(df.style.hide(axis='index'))  # hide_index() is deprecated in pandas >= 1.4

def highlight_max(s):
    is_max = s == s.max()
    return ['background-color: SpringGreen' if v else '' for v in is_max]


def Feature_Ranking(clf):
    rows = []  # DataFrame.append was removed in pandas 2.0, so collect rows in a list
    for n in range(2, X.shape[1]):
        selector = RFE(estimator= clf, n_features_to_select=n, verbose=0)
        selector.fit(X_train_STD, y_train)
        rows.append({'Number of Features to Select': n,
                     'Score': metrics.accuracy_score(y_test, selector.predict(X_test_STD)),
                     'Features': X.columns[selector.support_].tolist(),
                     'Best Features': X.columns[selector.ranking_ == 1].tolist()})

    df = pd.DataFrame(rows, columns=['Number of Features to Select', 'Score', 'Features', 'Best Features'])
    display(df.style.apply(highlight_max, subset=['Score']))
    return df.loc[df.Score == df.Score.max(), 'Features'].values[0]

def ROC_Curve(clf, X_test = X_test_STD):
    # false positive rates, true positive rates and thresholds
    fpr, tpr, threshold = metrics.roc_curve(y_test, clf.predict_proba(X_test)[:,1])

    fig, ax = plt.subplots(1, 1, figsize=(5.5, 5.5))
    _ = ax.plot(fpr, tpr, lw=2, label = 'AUC = %0.2f' % metrics.auc(fpr, tpr))
    _ = ax.plot([0, 1], [0, 1],'r--', lw=2)
    _ = ax.legend(loc = 'lower right', fontsize = 14)
    _ = ax.set_xlim([0,1])
    # _ = ax.set_ylim([0,1])
    _ = ax.set_xlabel('False Positive Rate (FPR)')
    _ = ax.set_ylabel('True Positive Rate (TPR)')

Decision Tree Classifier

The first classifier that we use here is the Decision Tree Classifier.

In [13]:
dtc = DecisionTreeClassifier()
_ = dtc.fit(X_train_STD,y_train)
Performance(dtc)
ROC_Curve(dtc)
F1 Score Precision Score Recall Score Score
0.690021 0.692140 0.688312 0.688312

Alternatively, we can use RFE from sklearn.feature_selection, which provides feature ranking via recursive feature elimination: starting from all features, the least important ones (according to the fitted estimator) are pruned recursively until the requested number remains.
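
A minimal standalone sketch of the underlying API (independent of the Feature_Ranking helper defined above; the choice of 4 features is only illustrative):

# Sketch: RFE keeps the n_features_to_select top-ranked features;
# support_ marks the kept features and ranking_ equals 1 for each of them
selector = RFE(estimator=DecisionTreeClassifier(), n_features_to_select=4)
selector.fit(X_train_STD, y_train)
print(X.columns[selector.support_].tolist())
print(dict(zip(X.columns, selector.ranking_)))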

In [14]:
Best_Features = Feature_Ranking(dtc)
Number of Features to Select Score Features Best Features
0 2.000000 0.675325 ['Glucose', 'BMI'] ['Glucose', 'BMI']
1 3.000000 0.658009 ['Glucose', 'BMI', 'DiabetesPedigreeFunction'] ['Glucose', 'BMI', 'DiabetesPedigreeFunction']
2 4.000000 0.688312 ['Glucose', 'BMI', 'DiabetesPedigreeFunction', 'Age'] ['Glucose', 'BMI', 'DiabetesPedigreeFunction', 'Age']
3 5.000000 0.701299 ['Glucose', 'BloodPressure', 'BMI', 'DiabetesPedigreeFunction', 'Age'] ['Glucose', 'BloodPressure', 'BMI', 'DiabetesPedigreeFunction', 'Age']
4 6.000000 0.675325 ['Glucose', 'BloodPressure', 'SkinThickness', 'BMI', 'DiabetesPedigreeFunction', 'Age'] ['Glucose', 'BloodPressure', 'SkinThickness', 'BMI', 'DiabetesPedigreeFunction', 'Age']
5 7.000000 0.705628 ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age'] ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']

Thus, we can use fewer features and improve the classification results. The best features for this classification are:

In [15]:
print(Best_Features)
['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
In [16]:
dtc = DecisionTreeClassifier()
_ = dtc.fit(X_train_STD[Best_Features],y_train)
Performance(dtc, X_test_STD[Best_Features])
ROC_Curve(dtc, X_test_STD[Best_Features])
F1 Score Precision Score Recall Score Score
0.698631 0.700710 0.696970 0.696970

RFE can be very useful, especially when the number of features is large.

Random Forest Classifier

For more details on random forests, see the scikit-learn documentation for RandomForestClassifier.

In [17]:
rfc = RandomForestClassifier()
_ = rfc.fit(X_train_STD,y_train)
Performance(rfc)
ROC_Curve(rfc)
F1 Score Precision Score Recall Score Score
0.749631 0.750458 0.748918 0.748918
In [18]:
Best_Features = Feature_Ranking(rfc)
Number of Features to Select Score Features Best Features
0 2.000000 0.718615 ['Glucose', 'BMI'] ['Glucose', 'BMI']
1 3.000000 0.709957 ['Glucose', 'BMI', 'Age'] ['Glucose', 'BMI', 'Age']
2 4.000000 0.740260 ['Glucose', 'BMI', 'DiabetesPedigreeFunction', 'Age'] ['Glucose', 'BMI', 'DiabetesPedigreeFunction', 'Age']
3 5.000000 0.744589 ['Glucose', 'BloodPressure', 'BMI', 'DiabetesPedigreeFunction', 'Age'] ['Glucose', 'BloodPressure', 'BMI', 'DiabetesPedigreeFunction', 'Age']
4 6.000000 0.744589 ['Pregnancies', 'Glucose', 'BloodPressure', 'BMI', 'DiabetesPedigreeFunction', 'Age'] ['Pregnancies', 'Glucose', 'BloodPressure', 'BMI', 'DiabetesPedigreeFunction', 'Age']
5 7.000000 0.748918 ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'BMI', 'DiabetesPedigreeFunction', 'Age'] ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'BMI', 'DiabetesPedigreeFunction', 'Age']

Thus, we can use fewer features and improve the classification results. The best features for this classification are:

In [19]:
print(Best_Features)
['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'BMI', 'DiabetesPedigreeFunction', 'Age']
In [20]:
rfc = RandomForestClassifier()
_ = rfc.fit(X_train_STD[Best_Features],y_train)
Performance(rfc, X_test_STD[Best_Features])
ROC_Curve(rfc, X_test_STD[Best_Features])
F1 Score Precision Score Recall Score Score
0.750911 0.753925 0.748918 0.748918

Overall, the Random Forest Classifier performed better than the Decision Tree in this example. Furthermore, training on the RFE-selected features improved the Decision Tree's accuracy and slightly improved the Random Forest's F1 and precision scores.
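
As a sanity check (a sketch, not part of the original notebook, reusing the cross_val_score and KFold imports from the top), one might confirm the comparison with cross-validation rather than a single train/test split:

# Sketch: 5-fold cross-validated accuracy for both classifiers;
# tree-based models do not need standardized features, so raw X is used
cv = KFold(n_splits=5, shuffle=True, random_state=42)
for name, clf in [('Decision Tree', DecisionTreeClassifier()),
                  ('Random Forest', RandomForestClassifier())]:
    scores = cross_val_score(clf, X, y, cv=cv, scoring='accuracy')
    print('%s: %.3f +/- %.3f' % (name, scores.mean(), scores.std()))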